During the Spring 2020 offering of DIYtranscriptomics, we analyzed a subset of patients and healthy controls from Amorim et al., 2019. This reproducible and dynamic report was created using Rmarkdown and the Knitr package, and summarizes the basic code and outputs (plots, tables, etc) produced during the course.
A variety of R packages was used for this analysis. All graphics and data wrangling were handled using the tidyverse suite of packages. All packages used are available from the Comprehensive R Archive Network (CRAN), Bioconductor.org, or Github.
Raw reads were mapped to the human reference transcriptome using Kallisto, version 0.46.2. The quality of raw reads, as well as the results of Kallisto mapping are summarized in this summary report generated using fastqc and multiqc.
After read mapping with Kallisto, TxImport was used to read kallisto outputs into the R environment. Annotation data from Biomart was used to summarize data from transcript-level to gene-level.
library(tidyverse) # provides access to Hadley Wickham's collection of R packages for data science
library(tximport) # package for getting Kallisto results into R
library(ensembldb) # helps deal with ensembl
library(EnsDb.Hsapiens.v86) # replace with your organism-specific database package
targets <- read_tsv("../3-read_mapping_Kallisto/test/studydesign.txt") # read in your study design
path <- file.path("../3-read_mapping_Kallisto/test", targets$sample, "abundance.tsv") # set file paths to your mapped data
Tx <- transcripts(EnsDb.Hsapiens.v86, columns=c("tx_id", "gene_name"))
Tx <- as_tibble(Tx)
Tx <- dplyr::rename(Tx, target_id = tx_id)
Tx <- dplyr::select(Tx, "target_id", "gene_name")
Txi_gene <- tximport(path,
type = "kallisto",
tx2gene = Tx,
txOut = FALSE, # determines whether your data represented at transcript or gene level
countsFromAbundance = "lengthScaledTPM",
ignoreTxVersion = TRUE)library(tidyverse)
library(edgeR)
library(matrixStats)
library(cowplot)
sampleLabels <- targets$sample
myDGEList <- DGEList(Txi_gene$counts)
log2.cpm <- cpm(myDGEList, log=TRUE)
log2.cpm.df <- as_tibble(log2.cpm, rownames = "geneID")
colnames(log2.cpm.df) <- c("geneID", sampleLabels)
log2.cpm.df.pivot <- pivot_longer(log2.cpm.df, # dataframe to be pivoted
cols = HS01:CL13, # column names to be stored as a SINGLE variable
names_to = "samples", # name of that new variable (column)
values_to = "expression") # name of new variable (column) storing all the values (data)
p1 <- ggplot(log2.cpm.df.pivot) +
aes(x=samples, y=expression, fill=samples) +
geom_violin(trim = FALSE, show.legend = FALSE) +
stat_summary(fun = "median",
geom = "point",
shape = 95,
size = 10,
color = "black",
show.legend = FALSE) +
labs(y="log2 expression", x = "sample",
title="Log2 Counts per Million (CPM)",
subtitle="unfiltered, non-normalized",
caption=paste0("produced on ", Sys.time())) +
theme_bw()
cpm <- cpm(myDGEList)
keepers <- rowSums(cpm>1)>=5 #user defined
myDGEList.filtered <- myDGEList[keepers,]
log2.cpm.filtered <- cpm(myDGEList.filtered, log=TRUE)
log2.cpm.filtered.df <- as_tibble(log2.cpm.filtered, rownames = "geneID")
colnames(log2.cpm.filtered.df) <- c("geneID", sampleLabels)
log2.cpm.filtered.df.pivot <- pivot_longer(log2.cpm.filtered.df, # dataframe to be pivoted
cols = HS01:CL13, # column names to be stored as a SINGLE variable
names_to = "samples", # name of that new variable (column)
values_to = "expression") # name of new variable (column) storing all the values (data)
p2 <- ggplot(log2.cpm.filtered.df.pivot) +
aes(x=samples, y=expression, fill=samples) +
geom_violin(trim = FALSE, show.legend = FALSE) +
stat_summary(fun = "median",
geom = "point",
shape = 95,
size = 10,
color = "black",
show.legend = FALSE) +
labs(y="log2 expression", x = "sample",
title="Log2 Counts per Million (CPM)",
subtitle="filtered, non-normalized",
caption=paste0("produced on ", Sys.time())) +
theme_bw()
myDGEList.filtered.norm <- calcNormFactors(myDGEList.filtered, method = "TMM")
log2.cpm.filtered.norm <- cpm(myDGEList.filtered.norm, log=TRUE)
log2.cpm.filtered.norm.df <- as_tibble(log2.cpm.filtered.norm, rownames = "geneID")
colnames(log2.cpm.filtered.norm.df) <- c("geneID", sampleLabels)
log2.cpm.filtered.norm.df.pivot <- pivot_longer(log2.cpm.filtered.norm.df, # dataframe to be pivoted
cols = HS01:CL13, # column names to be stored as a SINGLE variable
names_to = "samples", # name of that new variable (column)
values_to = "expression") # name of new variable (column) storing all the values (data)
p3 <- ggplot(log2.cpm.filtered.norm.df.pivot) +
aes(x=samples, y=expression, fill=samples) +
geom_violin(trim = FALSE, show.legend = FALSE) +
stat_summary(fun = "median",
geom = "point",
shape = 95,
size = 10,
color = "black",
show.legend = FALSE) +
labs(y="log2 expression", x = "sample",
title="Log2 Counts per Million (CPM)",
subtitle="filtered, TMM normalized",
caption=paste0("produced on ", Sys.time())) +
theme_bw()
plot_grid(p1, p2, p3, labels = c('A', 'B', 'C'), label_size = 12)Filtering was carried out to remove lowly expressed genes. Genes with less than 1 count per million (CPM) in at least 5 or more samples filtered out. This reduced the number of genes from 35643 to 15944.
library(tidyverse)
library(DT)
library(gt)
library(plotly)
mydata.df <- mutate(log2.cpm.filtered.norm.df,
healthy.AVG = (HS01 + HS02 + HS03 + HS04 + HS05)/5,
disease.AVG = (CL08 + CL10 + CL11 + CL12 + CL13)/5,
#now make columns comparing each of the averages above that you're interested in
LogFC = (disease.AVG - healthy.AVG)) %>% #note that this is the first time you've seen the 'pipe' operator
mutate_if(is.numeric, round, 2)
datatable(mydata.df[,c(1,12:14)],
extensions = c('KeyTable', "FixedHeader"),
filter = 'top',
options = list(keys = TRUE,
searchHighlight = TRUE,
pageLength = 10,
lengthMenu = c("10", "25", "50", "100")))The table shown below includes expression data for 15944 genes. You can sort and search the data directly from the table.
group <- targets$group
group <- factor(group)
pca.res <- prcomp(t(log2.cpm.filtered.norm), scale.=F, retx=T)
pc.var<-pca.res$sdev^2 # sdev^2 captures these eigenvalues from the PCA result
pc.per<-round(pc.var/sum(pc.var)*100, 1)
pca.res.df <- as_tibble(pca.res$x)
pca.plot <- ggplot(pca.res.df) +
aes(x=PC1, y=PC2, label=sampleLabels, color = group) +
geom_point(size=4) +
stat_ellipse() +
xlab(paste0("PC1 (",pc.per[1],"%",")")) +
ylab(paste0("PC2 (",pc.per[2],"%",")")) +
labs(title="PCA plot",
caption=paste0("produced on ", Sys.time())) +
coord_fixed() +
theme_bw()
ggplotly(pca.plot)library(tidyverse)
library(limma)
library(edgeR)
library(gt)
library(DT)
library(plotly)
group <- factor(targets$group)
design <- model.matrix(~0 + group)
colnames(design) <- levels(group)
v.DEGList.filtered.norm <- voom(myDGEList.filtered.norm, design, plot = FALSE)
fit <- lmFit(v.DEGList.filtered.norm, design)
contrast.matrix <- makeContrasts(infection = disease - healthy,
levels=design)
fits <- contrasts.fit(fit, contrast.matrix)
ebFit <- eBayes(fits)
myTopHits <- topTable(ebFit, adjust ="BH", coef=1, number=40000, sort.by="logFC")
myTopHits.df <- myTopHits %>%
as_tibble(rownames = "geneID")
vplot <- ggplot(myTopHits.df) +
aes(y=-log10(adj.P.Val), x=logFC, text = paste("Symbol:", geneID)) +
geom_point(size=2) +
geom_hline(yintercept = -log10(0.01), linetype="longdash", colour="grey", size=1) +
geom_vline(xintercept = 1, linetype="longdash", colour="#BE684D", size=1) +
geom_vline(xintercept = -1, linetype="longdash", colour="#2C467A", size=1) +
#annotate("rect", xmin = 1, xmax = 12, ymin = -log10(0.01), ymax = 7.5, alpha=.2, fill="#BE684D") +
#annotate("rect", xmin = -1, xmax = -12, ymin = -log10(0.01), ymax = 7.5, alpha=.2, fill="#2C467A") +
labs(title="Volcano plot",
subtitle = "Cutaneous leishmaniasis",
caption=paste0("produced on ", Sys.time())) +
theme_bw()
ggplotly(vplot)To identify differentially expressed genes, precision weights were first applied to each gene based on its mean-variance relationship using VOOM, then data was normalized using the TMM method in EdgeR. Linear modeling and bayesian stats were employed via Limma to find genes that were up- or down-regulated in leishmania patients by 4-fold or more, with a false-discovery rate (FDR) of 0.01.
results <- decideTests(ebFit, method="global", adjust.method="BH", p.value=0.01, lfc=2)
colnames(v.DEGList.filtered.norm$E) <- sampleLabels
diffGenes <- v.DEGList.filtered.norm$E[results[,1] !=0,]
diffGenes.df <- as_tibble(diffGenes, rownames = "geneID")
datatable(diffGenes.df,
extensions = c('KeyTable', "FixedHeader"),
caption = 'Table 1: DEGs in cutaneous leishmaniasis',
options = list(keys = TRUE, searchHighlight = TRUE, pageLength = 10, lengthMenu = c("10", "25", "50", "100"))) %>%
formatRound(columns=c(2:11), digits=2)Pearson correlation was used to cluster 2314 differentially expressed genes, which were then represented as heatmap with the data scaled by Zscore for each row.
library(tidyverse)
library(gplots)
library(RColorBrewer)
myheatcolors <- rev(brewer.pal(name="RdBu", n=11))
clustRows <- hclust(as.dist(1-cor(t(diffGenes), method="pearson")), method="complete") #cluster rows by pearson correlation
clustColumns <- hclust(as.dist(1-cor(diffGenes, method="spearman")), method="complete")
module.assign <- cutree(clustRows, k=2)
module.color <- rainbow(length(unique(module.assign)), start=0.1, end=0.9)
module.color <- module.color[as.vector(module.assign)]
heatmap.2(diffGenes,
Rowv=as.dendrogram(clustRows),
Colv=as.dendrogram(clustColumns),
RowSideColors=module.color,
col=myheatcolors, scale='row', labRow=NA,
density.info="none", trace="none",
cexRow=1, cexCol=1, margins=c(8,20))modulePick <- 2
myModule_up <- diffGenes[names(module.assign[module.assign %in% modulePick]),]
hrsub_up <- hclust(as.dist(1-cor(t(myModule_up), method="pearson")), method="complete")
heatmap.2(myModule_up,
Rowv=as.dendrogram(hrsub_up),
Colv=NA,
labRow = NA,
col=myheatcolors, scale="row",
density.info="none", trace="none",
RowSideColors=module.color[module.assign%in%modulePick], margins=c(8,20))modulePick <- 1
myModule_down <- diffGenes[names(module.assign[module.assign %in% modulePick]),]
hrsub_down <- hclust(as.dist(1-cor(t(myModule_down), method="pearson")), method="complete")
heatmap.2(myModule_down,
Rowv=as.dendrogram(hrsub_down),
Colv=NA,
labRow = NA,
col=myheatcolors, scale="row",
density.info="none", trace="none",
RowSideColors=module.color[module.assign%in%modulePick], margins=c(8,20))GO enrichment for the 15944 genes induced by infection
library(tidyverse)
library(limma)
library(gplots) #f or heatmaps
library(DT) # interactive and searchable tables of our GSEA results
library(GSEABase) # functions and methods for Gene Set Enrichment Analysis
library(Biobase) # base functions for bioconductor; required by GSEABase
library(GSVA) # GSVA, a non-parametric and unsupervised method for estimating variation of gene set enrichment across samples
library(gprofiler2) # tools for accessing the GO enrichment results using g:Profiler web resources
library(clusterProfiler) # provides a suite of tools for functional enrichment analysis
library(msigdbr) # access to msigdb collections directly within R
library(enrichplot) # great for making the standard GSEA enrichment plots
gost.res_up <- gost(rownames(myModule_up), organism = "hsapiens", correction_method = "fdr")
gostplot(gost.res_up, interactive = T, capped = T) # set interactive=FALSE to get plot for publicationshs_gsea_c2 <- msigdbr(species = "Homo sapiens", # change depending on species your data came from
category = "C2") %>% # choose your msigdb collection of interest
dplyr::select(gs_name, gene_symbol) # just get the columns corresponding to signature name and gene symbols of genes in each signature
# Now that you have your msigdb collections ready, prepare your data
# grab the dataframe you made in step3 script
# Pull out just the columns corresponding to gene symbols and LogFC for at least one pairwise comparison for the enrichment analysis
mydata.df.sub <- dplyr::select(mydata.df, geneID, LogFC)
mydata.gsea <- mydata.df.sub$LogFC
names(mydata.gsea) <- as.character(mydata.df.sub$geneID)
mydata.gsea <- sort(mydata.gsea, decreasing = TRUE)
# run GSEA using the 'GSEA' function from clusterProfiler
myGSEA.res <- GSEA(mydata.gsea, TERM2GENE=hs_gsea_c2, verbose=FALSE)
myGSEA.df <- as_tibble(myGSEA.res@result)
# view results as an interactive table
datatable(myGSEA.df,
extensions = c('KeyTable', "FixedHeader"),
caption = 'Signatures enriched in leishmaniasis',
options = list(keys = TRUE, searchHighlight = TRUE, pageLength = 10, lengthMenu = c("10", "25", "50", "100"))) %>%
formatRound(columns=c(3:10), digits=2)# create enrichment plots using the enrichplot package
gseaplot2(myGSEA.res,
geneSetID = 47, #can choose multiple signatures to overlay in this plot
pvalue_table = FALSE, #can set this to FALSE for a cleaner plot
title = myGSEA.res$Description[47]) #can also turn off this title# add a variable to this result that matches enrichment direction with phenotype
myGSEA.df <- myGSEA.df %>%
mutate(phenotype = case_when(
NES > 0 ~ "disease",
NES < 0 ~ "healthy"))
# create 'bubble plot' to summarize y signatures across x phenotypes
ggplot(myGSEA.df[1:20,], aes(x=phenotype, y=ID)) +
geom_point(aes(size=setSize, color = NES, alpha=-log10(p.adjust))) +
scale_color_gradient(low="blue", high="red") +
theme_bw()Describe the results in your own words. Some things to think about:
The output from running ‘sessionInfo’ is shown below and details all packages and version necessary to reproduce the results in this report.
## R version 4.0.0 (2020-04-24)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19041)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=French_France.1252
## [2] LC_CTYPE=French_France.1252
## [3] LC_MONETARY=French_France.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=French_France.1252
##
## attached base packages:
## [1] stats4 parallel stats graphics
## [5] grDevices utils datasets methods
## [9] base
##
## other attached packages:
## [1] enrichplot_1.8.1
## [2] msigdbr_7.1.1
## [3] clusterProfiler_3.16.0
## [4] gprofiler2_0.1.9
## [5] GSVA_1.36.0
## [6] GSEABase_1.50.0
## [7] graph_1.66.0
## [8] annotate_1.66.0
## [9] XML_3.99-0.3
## [10] RColorBrewer_1.1-2
## [11] gplots_3.0.3
## [12] plotly_4.9.2.1
## [13] gt_0.2.1
## [14] DT_0.13
## [15] cowplot_1.0.0
## [16] matrixStats_0.56.0
## [17] edgeR_3.30.0
## [18] limma_3.44.1
## [19] EnsDb.Hsapiens.v86_2.99.0
## [20] ensembldb_2.12.1
## [21] AnnotationFilter_1.12.0
## [22] GenomicFeatures_1.40.0
## [23] AnnotationDbi_1.50.0
## [24] Biobase_2.48.0
## [25] GenomicRanges_1.40.0
## [26] GenomeInfoDb_1.24.0
## [27] IRanges_2.22.1
## [28] S4Vectors_0.26.1
## [29] BiocGenerics_0.34.0
## [30] tximport_1.16.0
## [31] forcats_0.5.0
## [32] stringr_1.4.0
## [33] dplyr_0.8.5
## [34] purrr_0.3.4
## [35] readr_1.3.1
## [36] tidyr_1.0.3
## [37] tibble_3.0.1
## [38] ggplot2_3.3.0
## [39] tidyverse_1.3.0
## [40] knitr_1.28
## [41] tinytex_0.23
## [42] rmarkdown_2.2
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.1.0
## [2] RSQLite_2.2.0
## [3] htmlwidgets_1.5.1
## [4] grid_4.0.0
## [5] BiocParallel_1.22.0
## [6] scatterpie_0.1.4
## [7] munsell_0.5.0
## [8] withr_2.2.0
## [9] colorspace_1.4-2
## [10] GOSemSim_2.14.0
## [11] rstudioapi_0.11
## [12] DOSE_3.14.0
## [13] labeling_0.3
## [14] urltools_1.7.3
## [15] GenomeInfoDbData_1.2.3
## [16] polyclip_1.10-0
## [17] bit64_0.9-8
## [18] farver_2.0.3
## [19] rhdf5_2.32.0
## [20] downloader_0.4
## [21] vctrs_0.3.0
## [22] generics_0.0.2
## [23] xfun_0.14
## [24] BiocFileCache_1.12.0
## [25] R6_2.4.1
## [26] graphlayouts_0.7.0
## [27] locfit_1.5-9.4
## [28] bitops_1.0-6
## [29] fgsea_1.14.0
## [30] gridGraphics_0.5-0
## [31] DelayedArray_0.14.0
## [32] assertthat_0.2.1
## [33] promises_1.1.0
## [34] scales_1.1.1
## [35] ggraph_2.0.3
## [36] gtable_0.3.0
## [37] tidygraph_1.2.0
## [38] rlang_0.4.6
## [39] splines_4.0.0
## [40] rtracklayer_1.48.0
## [41] lazyeval_0.2.2
## [42] broom_0.5.6
## [43] europepmc_0.3
## [44] BiocManager_1.30.10
## [45] yaml_2.2.1
## [46] reshape2_1.4.4
## [47] modelr_0.1.8
## [48] crosstalk_1.1.0.1
## [49] backports_1.1.7
## [50] httpuv_1.5.2
## [51] qvalue_2.20.0
## [52] tools_4.0.0
## [53] ggplotify_0.0.5
## [54] ellipsis_0.3.1
## [55] ggridges_0.5.2
## [56] Rcpp_1.0.4.6
## [57] plyr_1.8.6
## [58] progress_1.2.2
## [59] zlibbioc_1.34.0
## [60] RCurl_1.98-1.2
## [61] prettyunits_1.1.1
## [62] openssl_1.4.1
## [63] viridis_0.5.1
## [64] SummarizedExperiment_1.18.1
## [65] haven_2.2.0
## [66] ggrepel_0.8.2
## [67] fs_1.4.1
## [68] magrittr_1.5
## [69] data.table_1.12.8
## [70] DO.db_2.9
## [71] triebeard_0.3.0
## [72] reprex_0.3.0
## [73] ProtGenerics_1.20.0
## [74] hms_0.5.3
## [75] mime_0.9.1
## [76] evaluate_0.14.1
## [77] xtable_1.8-4
## [78] readxl_1.3.1
## [79] gridExtra_2.3
## [80] compiler_4.0.0
## [81] biomaRt_2.44.0
## [82] KernSmooth_2.23-17
## [83] crayon_1.3.4
## [84] htmltools_0.4.0
## [85] later_1.0.0
## [86] snow_0.4-3
## [87] lubridate_1.7.8
## [88] DBI_1.1.0
## [89] tweenr_1.0.1
## [90] dbplyr_1.4.3
## [91] MASS_7.3-51.6
## [92] rappdirs_0.3.1
## [93] Matrix_1.3-0
## [94] cli_2.0.2
## [95] gdata_2.18.0
## [96] igraph_1.2.5
## [97] pkgconfig_2.0.3
## [98] rvcheck_0.1.8
## [99] GenomicAlignments_1.24.0
## [100] xml2_1.3.2
## [101] XVector_0.28.0
## [102] rvest_0.3.5
## [103] digest_0.6.25
## [104] Biostrings_2.56.0
## [105] cellranger_1.1.0
## [106] fastmatch_1.1-0
## [107] curl_4.3
## [108] shiny_1.4.0.2
## [109] Rsamtools_2.4.0
## [110] gtools_3.8.2
## [111] lifecycle_0.2.0
## [112] nlme_3.1-147
## [113] jsonlite_1.6.1
## [114] Rhdf5lib_1.10.0
## [115] viridisLite_0.3.0
## [116] askpass_1.1
## [117] fansi_0.4.1
## [118] pillar_1.4.4
## [119] lattice_0.20-41
## [120] fastmap_1.0.1
## [121] httr_1.4.1
## [122] GO.db_3.11.1
## [123] glue_1.4.1
## [124] shinythemes_1.1.2
## [125] bit_1.1-15.2
## [126] ggforce_0.3.1
## [127] stringi_1.4.6
## [128] blob_1.2.1
## [129] caTools_1.18.0
## [130] memoise_1.1.0